For the number of topics, we focus on \(K + 1 (=4)\).
Description of data
| Parameters | Value |
|---|---|
| # Document | \(1000\) |
| Length of documents | Poisson( \(200\) ) |
| \(\alpha\) (control topic) | (0.953, 0.644, 1.091, 0.508, 1.216, 0.741) |
When we look at the vector \(\alpha\), we learn that words in topic 2 appear less frequent then other topics. This could be the reason that estimation of topic 2 is sometimes instable in the following results.
docs <- list.files(folder, pattern = "*.txt", full.names = TRUE)
explore_ <- explore(docs,
remove_numbers = FALSE, # For simulation, make it false
remove_punct = TRUE,
remove_symbols = TRUE,
remove_separators = TRUE)
explore_$data_tfidf %>%
group_by(term) %>%
summarize(TFIDFMedian = median(tf_idf)) %>%
arrange(desc(TFIDFMedian)) -> ranked_tfidf
We include one or two seeds from other topic (topic 3) in the keyword set of topic 1.
seed_list <- list(c("w1t1 w367t1 w80t3"),
c("w223t2 w541t2 w536t2"),
c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)
top_terms(contam1, n = 25, show_seed = T)
## 1 2 3 T_1
## [1,] "w1t1 [✓]" "w223t2 [✓]" "w26t3 [✓]" "w33t5"
## [2,] "w367t1 [✓]" "w541t2 [✓]" "w6t3 [✓]" "w65t5"
## [3,] "w180t1" "w536t2 [✓]" "w223t3 [✓]" "w173t6"
## [4,] "w80t3 [✓]" "w0t2" "w8t3" "w118t5"
## [5,] "w72t1" "w1245t2" "w87t3" "w19t5"
## [6,] "w75t1" "w122t2" "w77t3" "w45t6"
## [7,] "w63t1" "w164t2" "w127t3" "w651t6"
## [8,] "w449t1" "w619t2" "w264t3" "w185t6"
## [9,] "w60t1" "w23t2" "w153t3" "w55t6"
## [10,] "w832t1" "w725t2" "w115t3" "w58t5"
## [11,] "w11t1" "w623t2" "w143t3" "w37t6"
## [12,] "w405t1" "w943t2" "w263t3" "w209t5"
## [13,] "w1073t1" "w979t2" "w92t3" "w107t5"
## [14,] "w833t1" "w606t2" "w867t3" "w66t5"
## [15,] "w113t1" "w1501t2" "w593t3" "w129t5"
## [16,] "w273t1" "w158t2" "w949t3" "w33t4"
## [17,] "w404t1" "w134t2" "w222t3" "w368t5"
## [18,] "w21t1" "w724t2" "w884t3" "w269t5"
## [19,] "w403t1" "w556t2" "w953t3" "w91t5"
## [20,] "w808t1" "w568t2" "w196t3" "w116t5"
## [21,] "w1214t1" "w129t2" "w548t3" "w379t5"
## [22,] "w274t1" "w886t2" "w848t3" "w160t6"
## [23,] "w391t1" "w159t2" "w80t3 [1]" "w400t5"
## [24,] "w768t1" "w629t2" "w245t3" "w686t5"
## [25,] "w994t1" "w170t2" "w103t3" "w430t5"
diagnosis_topic_recovery_heatmap(contam1, 25)
lda4
diagnosis_model_fit(contam1, start=2)
seed_list <- list(c("w1t1 w367t1 w213t3"),
c("w223t2 w541t2 w536t2"),
c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)
top_terms(contam2, show_seed = T)
## 1 2 3 T_1
## [1,] "w1t1 [✓]" "w223t2 [✓]" "w26t3 [✓]" "w33t5"
## [2,] "w367t1 [✓]" "w33t4" "w6t3 [✓]" "w65t5"
## [3,] "w180t1" "w541t2 [✓]" "w223t3 [✓]" "w173t6"
## [4,] "w72t1" "w536t2 [✓]" "w80t3" "w118t5"
## [5,] "w75t1" "w0t2" "w8t3" "w19t5"
## [6,] "w63t1" "w1245t2" "w87t3" "w45t6"
## [7,] "w832t1" "w83t4" "w77t3" "w651t6"
## [8,] "w449t1" "w122t2" "w127t3" "w185t6"
## [9,] "w60t1" "w164t2" "w264t3" "w55t6"
## [10,] "w11t1" "w619t2" "w153t3" "w58t5"
diagnosis_topic_recovery_heatmap(contam2, 25)
lda4
diagnosis_model_fit(contam2, start=2)
seed_list <- list(c("w1t1 w367t1 w206t3"),
c("w223t2 w541t2 w536t2"),
c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)
top_terms(contam3, show_seed = T)
## 1 2 3 T_1
## [1,] "w1t1 [✓]" "w223t2 [✓]" "w26t3 [✓]" "w33t5"
## [2,] "w367t1 [✓]" "w541t2 [✓]" "w6t3 [✓]" "w65t5"
## [3,] "w180t1" "w536t2 [✓]" "w223t3 [✓]" "w173t6"
## [4,] "w33t4" "w0t2" "w80t3" "w118t5"
## [5,] "w72t1" "w1245t2" "w8t3" "w19t5"
## [6,] "w75t1" "w122t2" "w87t3" "w45t6"
## [7,] "w63t1" "w164t2" "w77t3" "w651t6"
## [8,] "w449t1" "w619t2" "w127t3" "w185t6"
## [9,] "w60t1" "w23t2" "w264t3" "w55t6"
## [10,] "w11t1" "w725t2" "w153t3" "w58t5"
diagnosis_topic_recovery_heatmap(contam3, 25)
lda4
diagnosis_model_fit(contam3, start=2)
seed_list <- list(c("w1t1 w213t3 w387t3"),
c("w223t2 w541t2 w536t2"),
c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)
top_terms(contam4, show_seed = T)
## 1 2 3 T_1
## [1,] "w1t1 [✓]" "w223t2 [✓]" "w26t3 [✓]" "w33t5"
## [2,] "w367t1" "w541t2 [✓]" "w6t3 [✓]" "w65t5"
## [3,] "w180t1" "w536t2 [✓]" "w223t3 [✓]" "w173t6"
## [4,] "w72t1" "w0t2" "w80t3" "w118t5"
## [5,] "w75t1" "w1245t2" "w8t3" "w19t5"
## [6,] "w63t1" "w122t2" "w87t3" "w45t6"
## [7,] "w832t1" "w164t2" "w77t3" "w651t6"
## [8,] "w449t1" "w619t2" "w127t3" "w185t6"
## [9,] "w60t1" "w23t2" "w264t3" "w55t6"
## [10,] "w11t1" "w725t2" "w153t3" "w58t5"
diagnosis_topic_recovery_heatmap(contam4, 25)
lda4
diagnosis_model_fit(contam4, start=2)
seed_list <- list(c("w1t1 w1492t3 w206t3"),
c("w223t2 w541t2 w536t2"),
c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)
top_terms(contam5, show_seed = T)
## 1 2 3 T_1
## [1,] "w1t1 [✓]" "w223t2 [✓]" "w26t3 [✓]" "w33t5"
## [2,] "w367t1" "w33t4" "w6t3 [✓]" "w65t5"
## [3,] "w180t1" "w541t2 [✓]" "w223t3 [✓]" "w173t6"
## [4,] "w72t1" "w536t2 [✓]" "w80t3" "w118t5"
## [5,] "w75t1" "w0t2" "w8t3" "w19t5"
## [6,] "w63t1" "w1245t2" "w87t3" "w45t6"
## [7,] "w449t1" "w83t4" "w77t3" "w651t6"
## [8,] "w832t1" "w122t2" "w127t3" "w55t6"
## [9,] "w60t1" "w164t2" "w264t3" "w185t6"
## [10,] "w11t1" "w619t2" "w153t3" "w58t5"
diagnosis_topic_recovery_heatmap(contam5, 25)
lda4
diagnosis_model_fit(contam5, start=2)
seed_list <- list(c("w1281t1 w2461t1 w213t3"),
c("w223t2 w541t2 w536t2"),
c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)
top_terms(contam6, show_seed = T)
## 1 2 3 T_1
## [1,] "w33t5" "w223t2 [✓]" "w26t3 [✓]" "w173t6"
## [2,] "w65t5" "w541t2 [✓]" "w6t3 [✓]" "w45t6"
## [3,] "w118t5" "w536t2 [✓]" "w223t3 [✓]" "w651t6"
## [4,] "w1t1" "w0t2" "w80t3" "w185t6"
## [5,] "w19t5" "w1245t2" "w8t3" "w55t6"
## [6,] "w367t1" "w122t2" "w87t3" "w37t6"
## [7,] "w180t1" "w164t2" "w77t3" "w33t4"
## [8,] "w58t5" "w619t2" "w127t3" "w160t6"
## [9,] "w209t5" "w23t2" "w264t3" "w83t4"
## [10,] "w107t5" "w725t2" "w153t3" "w50t6"
diagnosis_topic_recovery_heatmap(contam6, 25, topicvec = c(1,2,3,4))
lda4
diagnosis_model_fit(contam6, start=2)
seed_list <- list(c("w1281t1 w2461t1 w206t3"),
c("w223t2 w541t2 w536t2"),
c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)
top_terms(contam7, show_seed = T)
## 1 2 3 T_1
## [1,] "w33t5" "w223t2 [✓]" "w26t3 [✓]" "w173t6"
## [2,] "w65t5" "w541t2 [✓]" "w6t3 [✓]" "w45t6"
## [3,] "w118t5" "w536t2 [✓]" "w223t3 [✓]" "w651t6"
## [4,] "w1t1" "w0t2" "w80t3" "w185t6"
## [5,] "w19t5" "w1245t2" "w8t3" "w55t6"
## [6,] "w367t1" "w122t2" "w87t3" "w37t6"
## [7,] "w180t1" "w164t2" "w77t3" "w33t4"
## [8,] "w209t5" "w619t2" "w127t3" "w160t6"
## [9,] "w107t5" "w23t2" "w264t3" "w83t4"
## [10,] "w58t5" "w725t2" "w153t3" "w50t6"
diagnosis_topic_recovery_heatmap(contam7, 25, topicvec = c(1,2,3,4))
lda4
diagnosis_model_fit(contam7, start=2)
seed_list <- list(c("w1t1 w367t1 w180t1"),
c("w223t2 w541t2 w536t2"),
c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)
explore_$visualize_tfidf(seed_list)
## $density
##
## $median
top_terms(tf1, show_seed = T)
## 1 2 3 T_1
## [1,] "w1t1 [✓]" "w173t6" "w26t3 [✓]" "w33t5"
## [2,] "w367t1 [✓]" "w45t6" "w6t3 [✓]" "w65t5"
## [3,] "w180t1 [✓]" "w651t6" "w223t3 [✓]" "w118t5"
## [4,] "w72t1" "w185t6" "w80t3" "w19t5"
## [5,] "w75t1" "w223t2 [✓]" "w8t3" "w58t5"
## [6,] "w63t1" "w55t6" "w87t3" "w209t5"
## [7,] "w449t1" "w37t6" "w77t3" "w107t5"
## [8,] "w832t1" "w160t6" "w127t3" "w66t5"
## [9,] "w60t1" "w541t2 [✓]" "w264t3" "w129t5"
## [10,] "w11t1" "w536t2 [✓]" "w153t3" "w368t5"
diagnosis_topic_recovery_heatmap(tf1, 25)
lda4
diagnosis_model_fit(tf1, start=2)
seed_list <- list(c("w1t1 w367t1 w1671t1"),
c("w223t2 w541t2 w536t2"),
c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)
top_terms(mix1, show_seed = T)
## 1 2 3 T_1
## [1,] "w1t1 [✓]" "w173t6" "w26t3 [✓]" "w33t5"
## [2,] "w367t1 [✓]" "w45t6" "w6t3 [✓]" "w65t5"
## [3,] "w180t1" "w651t6" "w223t3 [✓]" "w118t5"
## [4,] "w72t1" "w185t6" "w80t3" "w19t5"
## [5,] "w75t1" "w223t2 [✓]" "w8t3" "w58t5"
## [6,] "w63t1" "w55t6" "w87t3" "w209t5"
## [7,] "w832t1" "w37t6" "w77t3" "w107t5"
## [8,] "w449t1" "w160t6" "w127t3" "w66t5"
## [9,] "w60t1" "w541t2 [✓]" "w264t3" "w129t5"
## [10,] "w11t1" "w536t2 [✓]" "w153t3" "w33t4"
diagnosis_topic_recovery_heatmap(mix1, 25)
lda4
diagnosis_model_fit(mix1, start=2)
seed_list <- list(c("w1t1 w367t1 w2532t1"),
c("w223t2 w541t2 w536t2"),
c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)
top_terms(mix2, show_seed = T)
## 1 2 3 T_1
## [1,] "w1t1 [✓]" "w223t2 [✓]" "w26t3 [✓]" "w33t5"
## [2,] "w367t1 [✓]" "w541t2 [✓]" "w6t3 [✓]" "w65t5"
## [3,] "w180t1" "w536t2 [✓]" "w223t3 [✓]" "w173t6"
## [4,] "w72t1" "w0t2" "w80t3" "w118t5"
## [5,] "w75t1" "w1245t2" "w8t3" "w19t5"
## [6,] "w63t1" "w122t2" "w87t3" "w45t6"
## [7,] "w832t1" "w164t2" "w77t3" "w185t6"
## [8,] "w449t1" "w619t2" "w127t3" "w651t6"
## [9,] "w60t1" "w23t2" "w264t3" "w55t6"
## [10,] "w11t1" "w725t2" "w153t3" "w58t5"
diagnosis_topic_recovery_heatmap(mix2, 25)
lda4
diagnosis_model_fit(mix2, start=2)
seed_list <- list(c("w1t1 w1185t1 w1671t1"),
c("w223t2 w541t2 w536t2"),
c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)
top_terms(mix3, show_seed = T)
## 1 2 3 T_1
## [1,] "w1t1 [✓]" "w223t2 [✓]" "w26t3 [✓]" "w33t5"
## [2,] "w367t1" "w541t2 [✓]" "w6t3 [✓]" "w65t5"
## [3,] "w180t1" "w536t2 [✓]" "w223t3 [✓]" "w173t6"
## [4,] "w72t1" "w0t2" "w80t3" "w118t5"
## [5,] "w75t1" "w1245t2" "w8t3" "w19t5"
## [6,] "w63t1" "w122t2" "w87t3" "w45t6"
## [7,] "w449t1" "w164t2" "w77t3" "w651t6"
## [8,] "w832t1" "w619t2" "w127t3" "w185t6"
## [9,] "w11t1" "w23t2" "w264t3" "w55t6"
## [10,] "w405t1" "w725t2" "w153t3" "w58t5"
diagnosis_topic_recovery_heatmap(mix3, 25)
lda4
diagnosis_model_fit(mix3, start=2)
seed_list <- list(c("w1t1 w2461t1 w2532t1"),
c("w223t2 w541t2 w536t2"),
c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)
top_terms(mix4, show_seed = T)
## 1 2 3 T_1
## [1,] "w1t1 [✓]" "w173t6" "w26t3 [✓]" "w33t5"
## [2,] "w367t1" "w45t6" "w6t3 [✓]" "w65t5"
## [3,] "w180t1" "w651t6" "w223t3 [✓]" "w118t5"
## [4,] "w72t1" "w185t6" "w80t3" "w19t5"
## [5,] "w75t1" "w223t2 [✓]" "w8t3" "w58t5"
## [6,] "w63t1" "w55t6" "w87t3" "w209t5"
## [7,] "w832t1" "w37t6" "w77t3" "w107t5"
## [8,] "w449t1" "w160t6" "w127t3" "w66t5"
## [9,] "w60t1" "w541t2 [✓]" "w264t3" "w129t5"
## [10,] "w11t1" "w536t2 [✓]" "w153t3" "w33t4"
diagnosis_topic_recovery_heatmap(mix4, 25)
lda4
diagnosis_model_fit(mix4, start=2)
seed_list <- list(c("w1281t1 w2461t1 w2532t1"),
c("w223t2 w541t2 w536t2"),
c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)
top_terms(mix5, show_seed = T)
## 1 2 3 T_1
## [1,] "w1t1" "w173t6" "w26t3 [✓]" "w33t5"
## [2,] "w367t1" "w45t6" "w6t3 [✓]" "w65t5"
## [3,] "w180t1" "w651t6" "w223t3 [✓]" "w118t5"
## [4,] "w33t4" "w185t6" "w80t3" "w19t5"
## [5,] "w72t1" "w223t2 [✓]" "w8t3" "w58t5"
## [6,] "w75t1" "w55t6" "w87t3" "w209t5"
## [7,] "w63t1" "w37t6" "w77t3" "w107t5"
## [8,] "w832t1" "w541t2 [✓]" "w127t3" "w66t5"
## [9,] "w449t1" "w160t6" "w264t3" "w129t5"
## [10,] "w60t1" "w536t2 [✓]" "w153t3" "w368t5"
diagnosis_topic_recovery_heatmap(mix5, 25)
lda4
diagnosis_model_fit(mix5, start=2)
seed_list <- list(c("w1281t1 w1185t1 w2532t1"),
c("w223t2 w541t2 w536t2"),
c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)
top_terms(mix6, show_seed = T)
## 1 2 3 T_1
## [1,] "w173t6" "w223t2 [✓]" "w26t3 [✓]" "w33t5"
## [2,] "w1t1" "w541t2 [✓]" "w6t3 [✓]" "w65t5"
## [3,] "w45t6" "w536t2 [✓]" "w223t3 [✓]" "w118t5"
## [4,] "w367t1" "w0t2" "w80t3" "w19t5"
## [5,] "w651t6" "w1245t2" "w8t3" "w58t5"
## [6,] "w185t6" "w122t2" "w87t3" "w209t5"
## [7,] "w180t1" "w164t2" "w77t3" "w107t5"
## [8,] "w37t6" "w619t2" "w127t3" "w66t5"
## [9,] "w55t6" "w23t2" "w264t3" "w129t5"
## [10,] "w72t1" "w725t2" "w153t3" "w33t4"
diagnosis_topic_recovery_heatmap(mix6, 25, topicvec = c(1,2,3,4))
lda4
diagnosis_model_fit(mix6, start=2)
seed_list <- list(c("w1t1 w1185t1 w2532t1"),
c("w223t2 w541t2 w536t2"),
c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)
top_terms(mix7, show_seed = T)
## 1 2 3 T_1
## [1,] "w1t1 [✓]" "w173t6" "w26t3 [✓]" "w33t5"
## [2,] "w367t1" "w45t6" "w6t3 [✓]" "w65t5"
## [3,] "w180t1" "w651t6" "w223t3 [✓]" "w118t5"
## [4,] "w72t1" "w185t6" "w80t3" "w19t5"
## [5,] "w75t1" "w223t2 [✓]" "w8t3" "w58t5"
## [6,] "w63t1" "w55t6" "w87t3" "w209t5"
## [7,] "w832t1" "w37t6" "w77t3" "w107t5"
## [8,] "w449t1" "w33t4" "w127t3" "w66t5"
## [9,] "w60t1" "w541t2 [✓]" "w264t3" "w129t5"
## [10,] "w11t1" "w160t6" "w153t3" "w368t5"
diagnosis_topic_recovery_heatmap(mix7, 25, topicvec = c(1,2,3,4))
lda4
diagnosis_model_fit(mix7, start=2)
seed_list <- list(c("w2132t1 w2461t1 w2532t1"),
c("w223t2 w541t2 w536t2"),
c("w26t3 w6t3 w223t3"))
explore_$visualize_dict_prop(seed_list)
explore_$visualize_tfidf(seed_list)
## $density
##
## $median
top_terms(tf_sp, show_seed = T)
## 1 2 3 T_1
## [1,] "w33t5" "w223t2 [✓]" "w26t3 [✓]" "w173t6"
## [2,] "w65t5" "w541t2 [✓]" "w6t3 [✓]" "w45t6"
## [3,] "w118t5" "w536t2 [✓]" "w223t3 [✓]" "w651t6"
## [4,] "w1t1" "w0t2" "w80t3" "w185t6"
## [5,] "w19t5" "w1245t2" "w8t3" "w55t6"
## [6,] "w367t1" "w122t2" "w87t3" "w37t6"
## [7,] "w180t1" "w164t2" "w77t3" "w33t4"
## [8,] "w58t5" "w619t2" "w127t3" "w160t6"
## [9,] "w209t5" "w23t2" "w264t3" "w83t4"
## [10,] "w107t5" "w725t2" "w153t3" "w50t6"
diagnosis_topic_recovery_heatmap(tf_sp, 25, topicvec = c(1,2,3,4))
lda4
diagnosis_model_fit(tf_sp, start=2)